dir = list.dirs("agneesh/all_data/kallisto_results/", full.names = T, recursive = F)
filename = file.path(dir, "abundance.tsv")
names <- data.frame(name = filename) %>% separate(name, sep = "//", into = c(NA,"test")) %>% separate(test, sep = "/", into = c("name", NA)) %>% pull(name)
pm <- tibble(target_id = character(), est_counts = numeric(), tpm = numeric(), library = character())
for (j in seq(filename)) pm <- rbind(pm, read_tsv(filename[j]) %>% select(target_id, est_counts, tpm) %>% mutate(library = names[j]))
genes <- read_tsv("ref/genes.txt", col_names = c("target_id", "gene"))
taiwan_venom <- read_rds("data/pm.RDS") %>% filter(grepl("Pm", library)) %>% left_join(genes, by = "target_id") %>% na.omit() %>%
group_by(gene) %>% summarize(tpm = sum(tpm))
taiwan_proteome <- read_csv("data/taiwan-secreted.csv", col_types = "-nc", col_names = c("gene", "type"))
rbind(read_tsv("data/1-Taiwan-Female/abundance.tsv") %>% mutate(library = "taiwan1"),
read_tsv("data/11-Taiwan-Female/abundance.tsv") %>% mutate(library = "taiwan11"),
read_tsv("data/2-Taiwan-Female/abundance.tsv") %>% mutate(library = "taiwan2"),
read_tsv("data/3-Taiwan-Male/abundance.tsv") %>% mutate(library = "taiwan3")) %>%
left_join(genes, by = "target_id") %>% na.omit() %>%
group_by(library, gene) %>% summarize(tpm = sum(tpm), est_counts = sum(est_counts)) -> taiwan
averaged_data <- left_join(taiwan %>% filter(est_counts > 3) %>%
group_by(gene) %>% summarize(tpm = mean(tpm), est_counts = mean(est_counts)), taiwan_venom %>% group_by(gene) %>% summarize(tpm = mean(tpm)), by = "gene")
with(averaged_data, cor.test(tpm.x, tpm.y, method = "s"))
## Warning in cor.test.default(tpm.x, tpm.y, method = "s"): Cannot compute exact p-
## value with ties
##
## Spearman's rank correlation rho
##
## data: tpm.x and tpm.y
## S = 5.9513e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.3908262
tp1 <- averaged_data %>% ggplot(aes(tpm.y, tpm.x)) + geom_hex() + scale_y_log10(labels=trans_format('log10',math_format(10^.x))) + scale_x_log10(labels=trans_format('log10',math_format(10^.x))) + theme_bw() + xlab("Venom gland RNA (TPM)") + ylab("Venom RNA (TPM)") + labs(color=expression('Log'[10]*" counts")) + guides(fill = "none")
tp1
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 89 rows containing non-finite values (stat_binhex).
taiwan_merged_venom <- left_join(filter(taiwan, gene %in% taiwan_proteome$gene), taiwan_venom, by = "gene") %>% left_join(taiwan_proteome)
taiwan_merged_venom %>% group_by(library) %>% summarize(cor = cor(tpm.x, tpm.y), method = "s")
left_join(filter(taiwan, est_counts >= 1 & ! gene %in% taiwan_proteome$gene), taiwan_venom, by = "gene") %>% group_by(library) %>% summarize(cor = cor(tpm.x, tpm.y), method = "s")
p1 <- taiwan_merged_venom %>% na.omit() %>% filter(est_counts >= 1) %>% ggplot(aes(tpm.y, tpm.x, shape = library, color = type)) + geom_line(aes(group = gene), color = "grey") + geom_point() + scale_y_log10(labels=trans_format('log10',math_format(10^.x))) +scale_x_log10(labels=trans_format('log10',math_format(10^.x))) + theme_bw() + xlab("Average venom gland gene expression (TPM)") + ylab("Venom mRNA level (TPM)") + guides(shape = "none") + ggtitle("Protobothrops mucrosquamatus") + theme(plot.title = element_text(face = "italic"))
p1
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
#taiwan_merged_venom %>% na.omit() %>% filter(est_counts >= 1) %>% ggplot(aes(tpm.y, tpm.x, shape = library, color = type)) + geom_line(aes(group = gene), color = "grey") + geom_point() + theme_bw() + xlab("Average venom gland gene expression (TPM)") + ylab("Venom mRNA level (TPM)") + guides(shape = "none") + ggtitle("Protobothrops mucrosquamatus") + theme(plot.title = element_text(face = "italic")) + annotation_custom(grob = ggplotGrob(tp1, xmin = 1, xmax = 2, ymin = 3, ymax = 4)) + scale_y_log10(labels=trans_format('log10',math_format(10^.x))) +scale_x_log10(labels=trans_format('log10',math_format(10^.x)))
#p1_1 <- p1 + annotation_custom(grob = ggplotGrob(tp1 + theme(axis.title=element_blank(),
#) ),
#xmin = .7, xmax = 3, ymin = 2, ymax = 5)
taiwan %>% filter(tpm > 1) %>%
select(-est_counts) %>%
mutate(tpm = log(1+tpm)) %>% pivot_wider(names_from = library, values_from = tpm) %>%
select(-taiwan2) %>%
filter(!if_any(everything(), is.na)) %>%
ggpairs(columns = 2:4)
# Where are venom toxins?
Pm_tpm_exp<-rbind(read_tsv("data/1-Taiwan-Female/abundance.tsv") %>% mutate(id="taiwan1"),
read_tsv("data/2-Taiwan-Female/abundance.tsv") %>% mutate(id="taiwan2"),
read_tsv("data/3-Taiwan-Male/abundance.tsv") %>% mutate(id="taiwan3"),
read_tsv("data/11-Taiwan-Female/abundance.tsv") %>% mutate(id="taiwan11")) %>%
dplyr::select(target_id,tpm,id) %>% spread(id,tpm)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## target_id = col_character(),
## length = col_double(),
## eff_length = col_double(),
## est_counts = col_double(),
## tpm = col_double()
## )
##
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## target_id = col_character(),
## length = col_double(),
## eff_length = col_double(),
## est_counts = col_double(),
## tpm = col_double()
## )
##
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## target_id = col_character(),
## length = col_double(),
## eff_length = col_double(),
## est_counts = col_double(),
## tpm = col_double()
## )
##
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## target_id = col_character(),
## length = col_double(),
## eff_length = col_double(),
## est_counts = col_double(),
## tpm = col_double()
## )
pm<-readRDS("./data/pm.RDS")
pm<-pm %>% dplyr::select(-est_counts) %>% spread(library,tpm)
pm_tpm<-cbind(Pm_tpm_exp,pm[14:41])
dat<-pm_tpm %>% group_by(target_id) %>%
summarise(venom_gland=mean(Pm_10:Pm_9),venom_rna = mean(taiwan1:taiwan3)) %>%
filter(venom_gland >1,venom_rna>1) #filter to remove low exp genes hugging the axis to improve readability
toxins<-read.csv("./data/toxins.csv")
dat_toxins<-dat %>%filter(target_id %in% toxins$Transcripts)
dat_toxins %>% ggplot()+
geom_point(aes(x=target_id, y=venom_rna,color="venom"))+
geom_line(aes(x=target_id,y=venom_rna,group =1), color="#6B1EE1")+
geom_point(aes(x=target_id, y=venom_gland,color="gland"))+
geom_line(aes(x=target_id,y=venom_gland,group=1), color="#94E11E")+
scale_color_manual(values = c("#94E11E","#6B1EE1"))+
scale_y_log10()+
theme_classic()
cor(dat_toxins$venom_gland,dat_toxins$venom_rna, method = 's')
## [,1]
## [1,] 0.6252115
toxins<-read.csv("./data/toxins.csv")
toxins<-rename(toxins, target_id=Transcripts)
allu_dat<-inner_join(dat_toxins,toxins,"target_id") %>% select(venom_gland, venom_rna, short) %>% rename(Toxin=short)
allu_dat<-allu_dat %>% gather(Toxin) %>% mutate(toxin = rep(allu_dat$Toxin,2)) %>% rename(tissue=Toxin)
toxins<-c("SVMP","SVSP","CTL","PLA2","LAAO","CRISP","VEGF","BPP","NGF")
allu_dat<-allu_dat %>% filter(toxin %in% toxins)
ggplot(allu_dat, aes(y= log(value),
axis1 = toxin,
axis2 = tissue))+
geom_alluvium(aes(fill = toxin))+
geom_stratum()+
geom_text(stat = "stratum",
aes(label = after_stat(stratum))) +
scale_x_discrete(limits = c("toxin", "tissue"),
expand = c(0.15, 0.05))+
scale_fill_viridis_d()+
theme_void()
taiwan_venom2 <- taiwan_venom <- read_rds("data/pm.RDS") %>% left_join(genes, by = "target_id") %>% na.omit() %>%
group_by(gene, library) %>% summarize(tpm = sum(tpm))
## `summarise()` has grouped output by 'gene'. You can override using the `.groups` argument.
taiwan_merged2 <- rbind(taiwan %>% filter(tpm >= 2) %>% select(-est_counts), taiwan_venom2) %>%
pivot_wider(names_from = library, values_from = tpm)
d <- taiwan_merged2 %>% ungroup %>% select(-gene, -P_m_05_heart, -taiwan2)
d <- d[sort(rowSums(d, na.rm = T), decreasing = T, index.return = T)[[2]][1:5000],]
d <- log(d+1) %>% t() %>% dist()
fit <- MASS::isoMDS(d, k=2)
## initial value 24.091138
## final value 24.072651
## converged
plotDat <- data.frame(x = fit$points[,1], y = fit$points[,2], tissue = c(rep("venom",3), rep(c("heart", "kidney", "liver"),2), c("kidney", "liver"), c("heart", "kidney", "liver"), rep("venom gland", 28)) )
centroids <- aggregate(cbind(x,y) ~ tissue, plotDat, mean)
conf.rgn <- do.call(rbind, lapply(unique(plotDat$tissue), function(t)
data.frame(tissue = as.character(t),
ellipse(cov(plotDat[plotDat$tissue == t, 1:2]),
centre=as.matrix(centroids[centroids$tissue == t,2:3]),
level=0.95),
stringsAsFactors=FALSE)))
tp2 <- ggplot(plotDat, aes(x,y,color = tissue)) + geom_point(size=3) + theme_bw() + geom_path(data=conf.rgn) + xlab("NMDS Axis 1") + ylab("NMDS Axis 2")+ theme(legend.position = 'bottom')
tp2
#g <- grid.arrange(tp1, tp2, ncol = 1)
#ggsave(file="plots/Figure 2 taiwan.pdf", g, width = 5, height = 10)
As expected given the lower coverage the venom RNA nests with the venom gland samples
modules <- read_csv(file = "data/modules.csv")
taiwan_venom_matrix <- rbind(read_rds("data/pm.RDS") %>% filter(grepl("Pm", library)) %>% select(-est_counts),
rbind(read_tsv("data/1-Taiwan-Female/abundance.tsv") %>% mutate(library = "taiwan1"),
read_tsv("data/11-Taiwan-Female/abundance.tsv") %>% mutate(library = "taiwan11"),
read_tsv("data/3-Taiwan-Male/abundance.tsv") %>% mutate(library = "taiwan3")) %>%
filter(est_counts > 1) %>% select(target_id, library, tpm) %>%
mutate(tpm = log(1+tpm)) %>% select(library, target_id, tpm)) %>%
pivot_wider(names_from = target_id, values_from = tpm)
taiwan_venom_matrix2 <- taiwan_venom_matrix[,intersect(modules$target_id, colnames(taiwan_venom_matrix)[colSums(is.na(taiwan_venom_matrix)) == 0])]
gsg <- goodSamplesGenes(taiwan_venom_matrix2)
gsg$allOK
goodModules <- modules %>% filter(target_id %in% colnames(taiwan_venom_matrix2)) %>% group_by(colors) %>% summarize(count = n() > 200) %>% filter(count == T) %>% pull(colors) # filter out small modules
keep <- intersect(modules %>% filter(colors %in% goodModules) %>% pull(target_id), colnames(taiwan_venom_matrix2))
setLabels <- c("old", "new");
multiExpr <- list(old = list(data = taiwan_venom_matrix2[1:28, keep]), new = list(data = taiwan_venom_matrix2[28:30, keep]))
multiColor <- list(old = modules %>% filter(target_id %in% keep) %>% pull(colors))
allowWGCNAThreads(10)
mp <- modulePreservation(multiExpr, multiColor,
parallelCalculation = T,
referenceNetworks = 1,
checkData = F,
nPermutations = 200,
randomSeed = 1,
verbose = 3)
mp<-readRDS("./data/module_preservation.rds")
mp$preservation$Z$ref.old$inColumnsAlsoPresentIn.new %>% mutate(p = 10^mp$preservation$log.p$ref.old$inColumnsAlsoPresentIn.new$log.psummary.pres) %>% select (moduleSize, Zsummary.pres, p)
#write_rds(mp, "data/module_preservation.rds")
The metavenom module is highly preserved in the venom RNA
#Human-habu_modules from orgin of specialisation study
habu_human_modules<-read_csv("./data/all_data_tpmKeep_modules.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## gene_symbol = col_character(),
## colors = col_character(),
## ens_gene_human = col_character(),
## habu_target_id = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
#get muscle module
habu_human_muscle<-habu_human_modules %>% filter(colors == "yellow")
#filter the muscle data in vRNA gRNA matrix
pm_tpm_muscle<-pm_tpm %>% filter(target_id %in% habu_human_muscle$habu_target_id)
dat_muscle<-pm_tpm_muscle %>% group_by(target_id) %>% summarise(venom_rna=mean(taiwan1:taiwan3),venom_gland=mean(Pm_10:Pm_9))
cor(dat_muscle$venom_rna,dat_muscle$venom_gland,method = 's')
## [,1]
## [1,] 0.4764015
#lower correlation for muscle than venom
dat_muscle %>% ggplot()+
geom_point(aes(x=target_id, y=venom_rna,color="venom"))+
geom_point(aes(x=target_id, y=venom_gland,color="gland"))+
scale_color_manual(values = c("#94E11E","#6B1EE1"))+
scale_y_log10()+
ylab("RNA abundance (logTPM)")+
theme_bw()
## Warning: Transformation introduced infinite values in continuous y-axis
#Most mucle abundance is in vgRNA. Comparatively, vRNA has lower amount of muscle contaminants.
taiwan_venom2 <- taiwan_venom <- read_rds("data/pm.RDS") %>% left_join(genes, by = "target_id") %>% na.omit() %>%
group_by(gene, library) %>% summarize(tpm = sum(tpm))
## `summarise()` has grouped output by 'gene'. You can override using the `.groups` argument.
taiwan_merged2 <- rbind(taiwan %>% filter(tpm >= 2) %>% select(-est_counts), taiwan_venom2) %>%
pivot_wider(names_from = library, values_from = tpm)
taiwan_merged2<-taiwan_merged2 %>% filter(!gene %in% habu_human_muscle$habu_gene_id)
d <- taiwan_merged2 %>% ungroup %>% select(-gene, -P_m_05_heart, -taiwan2)
d <- d[sort(rowSums(d, na.rm = T), decreasing = T, index.return = T)[[2]][1:5000],]
d <- log(d+1) %>% t() %>% dist()
fit <- MASS::isoMDS(d, k=2)
## initial value 21.796921
## final value 21.777990
## converged
plotDat <- data.frame(x = fit$points[,1], y = fit$points[,2], tissue = c(rep("venom",3), rep(c("heart", "kidney", "liver"),2), c("kidney", "liver"), c("heart", "kidney", "liver"), rep("venom gland", 28)) )
centroids <- aggregate(cbind(x,y) ~ tissue, plotDat, mean)
conf.rgn <- do.call(rbind, lapply(unique(plotDat$tissue), function(t)
data.frame(tissue = as.character(t),
ellipse(cov(plotDat[plotDat$tissue == t, 1:2]),
centre=as.matrix(centroids[centroids$tissue == t,2:3]),
level=0.95),
stringsAsFactors=FALSE)))
tp2 <- ggplot(plotDat, aes(x,y,color = tissue)) + geom_point(size=3) + theme_bw() + geom_path(data=conf.rgn) + xlab("NMDS Axis 1") + ylab("NMDS Axis 2")+ theme(legend.position = 'bottom')
tp2
pm_tpm %>% group_by(target_id) %>% summarise(vRNA = mean(taiwan1:taiwan3),vgRNA = mean(Pm_10:Pm_9)) %>% filter(vRNA > 2 & vgRNA >2)->model_dat
fit<-lm(log10(model_dat$vgRNA)~log10(model_dat$vRNA))
summary(fit)
##
## Call:
## lm(formula = log10(model_dat$vgRNA) ~ log10(model_dat$vRNA))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.10589 -0.34591 0.00843 0.32871 2.44065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.59561 0.01723 34.57 <2e-16 ***
## log10(model_dat$vRNA) 0.51516 0.01056 48.80 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5151 on 5175 degrees of freedom
## Multiple R-squared: 0.3152, Adjusted R-squared: 0.315
## F-statistic: 2381 on 1 and 5175 DF, p-value: < 2.2e-16
plot(fit)
ggplot(fit,aes(fit$residuals))+
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
model_dat %>%
ggplot(aes(x=log10(vgRNA), y=log10(vRNA)))+
geom_point()+
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
#model diagnostic plots how that the model has good properties
okinawa_old <- read_tsv("old/okinawa.txt") # protein data from Aird et al.
rbind(read_tsv("data/7-Okinawa-Male/abundance.tsv") %>% mutate(id = "okinawa7"),
read_tsv("data/8-Okinawa-Female/abundance.tsv") %>% mutate(id = "okinawa8"),
read_tsv("data/9-Okinawa-Male/abundance.tsv") %>% mutate(id = "okinawa9") ) %>%
left_join(read_tsv("data/okinawa_ref/abundance.tsv") %>% mutate(ref = tpm) %>% select(ref, target_id), by = c("target_id")) -> okinawa
okinawa_merged <- left_join(okinawa_old, okinawa, by =c("Pf" = "target_id"))
okinawa_merged %>% group_by(id) %>% filter(est_counts >= 0) %>% summarize(cor = cor(tpm, ref), method = "s")
cor.test(okinawa_old$fpkm, okinawa_old$`frag/len`)
##
## Pearson's product-moment correlation
##
## data: okinawa_old$fpkm and okinawa_old$`frag/len`
## t = 2.6585, df = 102, p-value = 0.009114
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.06516617 0.42625081
## sample estimates:
## cor
## 0.2545596
okinawa_merged %>% group_by(id) %>% filter(est_counts >= 0) %>% summarize(cor = cor(tpm, `frag/len`), method = "s")
p2 <- okinawa_merged %>% filter(est_counts >= 1) %>% ggplot(aes(ref, tpm, color = id)) + geom_line(aes(group = Pf), color = "grey")+ geom_point() + scale_y_log10(labels=trans_format('log10',math_format(10^.x))) +scale_x_log10(labels=trans_format('log10',math_format(10^.x))) + theme_bw() + xlab("Venom gland gene expression (TPM)") + ylab("Venom mRNA level (TPM)") + guides(color = F) + ggtitle("Protobothrops flavoviridis") + theme(plot.title = element_text(face = "italic"))
p2
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
pf_toxins<-read_csv("./data/Protobothrops-Table 3.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ID = col_character(),
## `Toxin Class` = col_character(),
## FPKM = col_number(),
## `% FPKM` = col_character(),
## Summary = col_character()
## )
pf_venom_toxins<-left_join(okinawa_merged %>% filter(Pf %in% pf_toxins$ID) %>% select(Pf,tpm,id) %>% spread(id,tpm),
okinawa_merged %>% select(Pf,ref) %>% group_by(Pf) %>% summarise(ref=mean(ref))) #dataset with 57 toxins (>1 est_count)
## Joining, by = "Pf"
cor(pf_venom_toxins %>% group_by(Pf) %>% summarise(mean_tpm=mean(okinawa7:okinawa9)) %>% select(mean_tpm),pf_venom_toxins$ref, method = 's')
## [,1]
## mean_tpm 0.7022297
#toxin vRNA and vgRNA highly correlated
dat_toxins<-cbind(pf_venom_toxins$Pf,pf_venom_toxins %>% group_by(Pf) %>% summarise(mean_tpm=mean(okinawa7:okinawa9)) %>% select(mean_tpm),pf_venom_toxins$ref) %>% rename(target_id=`pf_venom_toxins$Pf`,venom_rna=mean_tpm,venom_gland=`pf_venom_toxins$ref`)
dat_toxins<-left_join(dat_toxins,pf_toxins %>% rename(target_id=ID), by='target_id') %>% rename(toxin_class=`Toxin Class`)
dat_toxins %>% ggplot()+
geom_point(aes(x=toxin_class, y=venom_rna,color="venom"))+
geom_line(aes(x=toxin_class,y=venom_rna,group =1), color="#6B1EE1")+
geom_point(aes(x=toxin_class, y=venom_gland,color="gland"))+
geom_line(aes(x=toxin_class,y=venom_gland,group=1), color="#94E11E")+
scale_color_manual(values = c("#94E11E","#6B1EE1"))+
scale_y_log10()+
ylab("RNA abundance (logTPM)")+
theme_bw()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis